CS 584-Theory and Applications of Data Mining

Time Series Forecasting Project

GROUP MEMBERS:¶

Name-----GNumber¶

Tejas Ramesh-----G01445777¶

Sahil Shrikrishna Zele-----G01465963¶

Saksham Nayyar-----G01462522¶

Swapneel Suhas Vaidya-----G01459609¶

Importing Necessary libraries¶

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import calendar
from shapely.geometry import Point
import seaborn as sns
import folium
from folium.plugins import MarkerCluster
from pylab import rcParams
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
import sys
import itertools
import statsmodels.api as sm
import statsmodels.tsa.api as smt
import statsmodels.formula.api as smf
from statsmodels.tsa.stattools import adfuller
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from matplotlib import pyplot as plt
%matplotlib inline

Reading the Dataset¶

In [2]:
df = pd.read_csv('Baltimore_Crime_Data.csv')
In [3]:
df.head()
Out[3]:
CrimeDate CrimeTime CrimeCode Location Description Inside/Outside Weapon Post District Neighborhood Location 1 Total Incidents
0 11/12/2016 02:35:00 3B 300 SAINT PAUL PL ROBBERY - STREET O NaN 111.0 CENTRAL Downtown (39.2924100000, -76.6140800000) 1
1 11/12/2016 02:56:00 3CF 800 S BROADWAY ROBBERY - COMMERCIAL I FIREARM 213.0 SOUTHEASTERN Fells Point (39.2824200000, -76.5928800000) 1
2 11/12/2016 03:00:00 6D 1500 PENTWOOD RD LARCENY FROM AUTO O NaN 413.0 NORTHEASTERN Stonewood-Pentwood-Winston (39.3480500000, -76.5883400000) 1
3 11/12/2016 03:00:00 6D 6600 MILTON LN LARCENY FROM AUTO O NaN 424.0 NORTHEASTERN Westfield (39.3626300000, -76.5516100000) 1
4 11/12/2016 03:00:00 6E 300 W BALTIMORE ST LARCENY O NaN 111.0 CENTRAL Downtown (39.2893800000, -76.6197100000) 1

Data Pre-processing¶

In [4]:
df['District'].nunique()
Out[4]:
13
In [5]:
df['CrimeDate'] = df["CrimeDate"].str.cat(df["CrimeTime"], sep = " ")
In [6]:
df['CrimeDate'].loc[198664]=pd.to_datetime('09/24/2012 00:00:00')
C:\Users\tejas\AppData\Local\Temp\ipykernel_9520\2896351813.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['CrimeDate'].loc[198664]=pd.to_datetime('09/24/2012 00:00:00')
In [7]:
df[['Latitude', 'Longitude']] =df['Location 1'].str.strip('()').str.split(',', expand=True)
In [8]:
df['Latitude'] = df['Latitude'].astype(float)
df['Longitude'] = df['Longitude'].astype(float)
In [9]:
df['CrimeDate'] = pd.to_datetime(df['CrimeDate'])
In [10]:
df['CrimeDate'].dt.year.unique()
Out[10]:
array([2016, 2015, 2014, 2013, 2012, 2011], dtype=int64)
In [11]:
df.dtypes
Out[11]:
CrimeDate          datetime64[ns]
CrimeTime                  object
CrimeCode                  object
Location                   object
Description                object
Inside/Outside             object
Weapon                     object
Post                      float64
District                   object
Neighborhood               object
Location 1                 object
Total Incidents             int64
Latitude                  float64
Longitude                 float64
dtype: object
In [12]:
for i in range(0,len(df)):
    if df['Inside/Outside'].iloc[i]=='I':
        continue
    elif df['Inside/Outside'].iloc[i]=='O':
        continue
    elif df['Inside/Outside'].iloc[i]=='Inside':
        df['Inside/Outside'].iloc[i]='I'
    elif df['Inside/Outside'].iloc[i]=='Outside':
        df['Inside/Outside'].iloc[i]='O'
        
C:\Users\tejas\AppData\Local\Temp\ipykernel_9520\1713855148.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Inside/Outside'].iloc[i]='O'
C:\Users\tejas\AppData\Local\Temp\ipykernel_9520\1713855148.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Inside/Outside'].iloc[i]='I'
In [13]:
df['Inside/Outside'].unique()
Out[13]:
array(['O', 'I', nan], dtype=object)
In [14]:
df.shape
Out[14]:
(285807, 14)
In [15]:
df.describe()
Out[15]:
Post Total Incidents Latitude Longitude
count 285616.000000 285807.0 284188.000000 284188.000000
mean 504.234184 1.0 39.308704 -76.617269
std 261.354783 0.0 0.061588 0.042107
min 0.000000 1.0 39.200410 -76.711440
25% 242.000000 1.0 39.288380 -76.648020
50% 445.000000 1.0 39.303690 -76.613960
75% 723.000000 1.0 39.327570 -76.587600
max 945.000000 1.0 41.629730 -76.517840
In [16]:
rcParams['figure.figsize'] = 12, 8
df['Year'] = df['CrimeDate'].dt.year
complaints_by_year = df.groupby('Year')['Total Incidents'].sum()

complaints_by_year.plot(kind='line', figsize=(12, 8))
plt.title('Complaints by Year')
plt.xlabel('Year')
plt.ylabel('Total Crimes')
plt.show()
No description has been provided for this image
In [17]:
df.isna().sum()
Out[17]:
CrimeDate               0
CrimeTime               0
CrimeCode               0
Location             1623
Description             0
Inside/Outside       4196
Weapon             188411
Post                  191
District               58
Neighborhood         1701
Location 1           1619
Total Incidents         0
Latitude             1619
Longitude            1619
Year                    0
dtype: int64
In [18]:
column_to_check = ['Latitude','Longitude']
df.dropna(subset=column_to_check, inplace=True)
In [19]:
df= df.fillna('NA')
In [20]:
df.isna().sum()
Out[20]:
CrimeDate          0
CrimeTime          0
CrimeCode          0
Location           0
Description        0
Inside/Outside     0
Weapon             0
Post               0
District           0
Neighborhood       0
Location 1         0
Total Incidents    0
Latitude           0
Longitude          0
Year               0
dtype: int64
In [21]:
df['Total Incidents'].isnull().sum()
Out[21]:
0
In [22]:
df.shape
Out[22]:
(284188, 15)
In [23]:
# Extract year and month into new columns
df['Year'] = df['CrimeDate'].dt.year
df['Month'] = df['CrimeDate'].dt.month
# Group by year and month, and count the incidents
monthly_agg = df.groupby(['Latitude','Longitude']).size().reset_index(name='Incident Count')

print(monthly_agg)
       Latitude  Longitude  Incident Count
0      39.20041  -76.55602               6
1      39.20047  -76.55605               7
2      39.20155  -76.55021               1
3      39.20196  -76.55686               2
4      39.20208  -76.55695               1
...         ...        ...             ...
97946  41.62362  -76.52606               1
97947  41.62429  -76.52516               1
97948  41.62513  -76.52402               1
97949  41.62711  -76.52136               1
97950  41.62973  -76.51784               1

[97951 rows x 3 columns]
In [24]:
district_crimes = df.groupby(['Year','District']).size().reset_index(name='Incident Count')
district_crimes = district_crimes.dropna(subset=['District'])
dist= pd.pivot_table(district_crimes, values = "Incident Count", columns = "District", index = "Year")
dist
Out[24]:
District CENTRAL EASTERN Gay Street NA NORTHEASTERN NORTHERN NORTHESTERN NORTHWESTERN SOUTHEASTERN SOUTHERN SOUTHESTERN SOUTHWESTERN WESTERN
Year
2011 6290.0 4348.0 NaN NaN 8434.0 5067.0 8.0 4876.0 6719.0 6052.0 1.0 4491.0 4068.0
2012 6494.0 4227.0 NaN 1.0 7684.0 5468.0 5.0 4854.0 6555.0 5775.0 2.0 4418.0 3825.0
2013 5618.0 4122.0 NaN NaN 7788.0 5652.0 3.0 5272.0 6950.0 5576.0 2.0 4248.0 4055.0
2014 5022.0 3648.0 NaN NaN 7479.0 5232.0 2.0 4687.0 6398.0 5188.0 NaN 4353.0 3729.0
2015 5310.0 4087.0 NaN NaN 7661.0 5850.0 9.0 4727.0 7050.0 5180.0 2.0 4571.0 3978.0
2016 4885.0 3610.0 1.0 NaN 5786.0 4457.0 9.0 4150.0 5573.0 5075.0 3.0 4019.0 3509.0

Visualizing Crimes by Location of Occurence (Inside/Outside)¶

In [25]:
label_counts = df['Inside/Outside'].value_counts()

# Create a pie chart.
plt.figure(figsize=(6, 6))
plt.pie(label_counts, labels=label_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('Crimes by Location (Inside/Outside)')

# Display the pie chart.
plt.show()
No description has been provided for this image
In [26]:
def create_point(monthly_agg):
    return Point(monthly_agg['Longitude'], monthly_agg['Latitude'])
monthly_agg['geometry'] = monthly_agg.apply(create_point, axis=1)
In [28]:
with open('gz_2010_us_040_00_500k.json', 'rb') as file:
    data = file.read()
    if data.startswith(b'StartingBytes'):
        gdf_us_states = gpd.read_file('gz_2010_us_040_00_500k.json')
    else:
        gdf_us_states = gpd.read_file('gz_2010_us_040_00_500k.json')
In [29]:
us_map = folium.Map(location=[37.0902, -95.7129], zoom_start=4)
folium.GeoJson(gdf_us_states).add_to(us_map)
Out[29]:
<folium.features.GeoJson at 0x2b34851ddd0>
In [30]:
# Create a MarkerCluster for efficient marker rendering
marker_cluster = MarkerCluster().add_to(us_map)

for idx, row in monthly_agg.iterrows():
    incident_count = row['Incident Count']
    latitude = row['Latitude']
    longitude = row['Longitude']

    folium.Marker([latitude, longitude], tooltip=f'{incident_count} incidents').add_to(marker_cluster)
In [31]:
us_map.save('us_map.html') 
In [130]:
us_map
Out[130]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [32]:
monthly_agg_2 = df.groupby(['Year','Month']).size().reset_index(name='Incident Count')
print(monthly_agg_2)
    Year  Month  Incident Count
0   2011      1            3427
1   2011      2            3091
2   2011      3            4233
3   2011      4            4130
4   2011      5            4496
..   ...    ...             ...
66  2016      7            4069
67  2016      8            4214
68  2016      9            4421
69  2016     10            4648
70  2016     11            1518

[71 rows x 3 columns]

Decompose the timeseries Additively¶

In [33]:
# Performing seasonal decomposition
df_add_decompose = seasonal_decompose(monthly_agg_2['Incident Count'], model='additive', period=12)

# Creating a custom plot
fig, axes = plt.subplots(4, 1, figsize=(12, 8), sharex=True)

# Plotting the original time series
axes[0].set_title('Original Time Series')
axes[0].plot(df_add_decompose.observed, label='Observed', color='blue')
axes[0].legend(loc='upper left')

# Plotting the trend component
axes[1].set_title('Trend Component')
axes[1].plot(df_add_decompose.trend, label='Trend', color='red')
axes[1].legend(loc='upper left')

# Plotting the seasonal component
axes[2].set_title('Seasonal Component')
axes[2].plot(df_add_decompose.seasonal, label='Seasonal', color='green')
axes[2].legend(loc='upper left')

# Plotting the residual component
axes[3].set_title('Residual Component')
axes[3].plot(df_add_decompose.resid, label='Residual', color='purple')
axes[3].legend(loc='upper left')

# Customizing x-axis labels to show the year
plt.xticks(range(0, len(monthly_agg_2), 12), monthly_agg_2['Year'].unique())
plt.tight_layout()
plt.show()
No description has been provided for this image

Decompose the timeseries multiplicatively¶

In [34]:
# Performing seasonal decomposition
df_multi_decompose = seasonal_decompose(monthly_agg_2['Incident Count'], model='multiplicative', period=12)

# Creating a custom plot
fig, axes = plt.subplots(4, 1, figsize=(12, 8), sharex=True)

# Plotting the original time series
axes[0].set_title('Original Time Series')
axes[0].plot(df_multi_decompose.observed, label='Observed', color='blue')
axes[0].legend(loc='upper left')

# Plotting the trend component
axes[1].set_title('Trend Component')
axes[1].plot(df_multi_decompose.trend, label='Trend', color='red')
axes[1].legend(loc='upper left')

# Plotting the seasonal component
axes[2].set_title('Seasonal Component')
axes[2].plot(df_multi_decompose.seasonal, label='Seasonal', color='green')
axes[2].legend(loc='upper left')

# Plotting the residual component
axes[3].set_title('Residual Component')
axes[3].plot(df_multi_decompose.resid, label='Residual', color='purple')
axes[3].legend(loc='upper left')

# Customizing x-axis labels to show the year
plt.xticks(range(0, len(monthly_agg_2), 12), monthly_agg_2['Year'].unique())
plt.tight_layout()
plt.show()
No description has been provided for this image

Checking the residual plots¶

In [35]:
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8), sharex=True)

# Plotting Additive Decomposition
ax1.set_title('Additive Decomposition')
df_add_decompose.resid.plot(ax=ax1, color='green')

# Plotting Multiplicative Decomposition
ax2.set_title('Multiplicative Decomposition')
df_multi_decompose.resid.plot(ax=ax2, color='red')

# Customizing x-axis labels to show the year from the original DataFrame index
x_labels = monthly_agg_2['Year'].unique()
ax1.set_xticks(range(0, len(monthly_agg_2), 12))
ax1.set_xticklabels(x_labels, rotation=45)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [36]:
monthly_agg_2['Month'] = monthly_agg_2['Month'].apply(lambda x: calendar.month_abbr[x])
monthly_agg_2.head()
Out[36]:
Year Month Incident Count
0 2011 Jan 3427
1 2011 Feb 3091
2 2011 Mar 4233
3 2011 Apr 4130
4 2011 May 4496
In [37]:
monthly_crimes= pd.pivot_table(monthly_agg_2, values = "Incident Count", columns = "Year", index = "Month")
monthly_crimes
Out[37]:
Year 2011 2012 2013 2014 2015 2016
Month
Apr 4130.0 3961.0 3964.0 3736.0 3882.0 3817.0
Aug 4539.0 4491.0 4597.0 4105.0 4614.0 4214.0
Dec 4046.0 3869.0 4143.0 3689.0 3997.0 NaN
Feb 3091.0 3341.0 3180.0 2878.0 2637.0 2847.0
Jan 3427.0 3814.0 3797.0 3636.0 3606.0 3359.0
Jul 4643.0 4381.0 4460.0 4171.0 4749.0 4069.0
Jun 4404.0 4281.0 4392.0 3999.0 4613.0 4393.0
Mar 4233.0 4269.0 3615.0 3431.0 3338.0 3597.0
May 4496.0 4623.0 4424.0 4149.0 4397.0 4194.0
Nov 4385.0 3885.0 3937.0 3711.0 4030.0 1518.0
Oct 4618.0 4205.0 4583.0 4194.0 4187.0 4648.0
Sep 4342.0 4188.0 4194.0 4039.0 4375.0 4421.0
In [38]:
monthly_crimes = monthly_crimes.reindex(index = ['Jan','Feb','Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
monthly_crimes
Out[38]:
Year 2011 2012 2013 2014 2015 2016
Month
Jan 3427.0 3814.0 3797.0 3636.0 3606.0 3359.0
Feb 3091.0 3341.0 3180.0 2878.0 2637.0 2847.0
Mar 4233.0 4269.0 3615.0 3431.0 3338.0 3597.0
Apr 4130.0 3961.0 3964.0 3736.0 3882.0 3817.0
May 4496.0 4623.0 4424.0 4149.0 4397.0 4194.0
Jun 4404.0 4281.0 4392.0 3999.0 4613.0 4393.0
Jul 4643.0 4381.0 4460.0 4171.0 4749.0 4069.0
Aug 4539.0 4491.0 4597.0 4105.0 4614.0 4214.0
Sep 4342.0 4188.0 4194.0 4039.0 4375.0 4421.0
Oct 4618.0 4205.0 4583.0 4194.0 4187.0 4648.0
Nov 4385.0 3885.0 3937.0 3711.0 4030.0 1518.0
Dec 4046.0 3869.0 4143.0 3689.0 3997.0 NaN

We Notice Data is Missing for part of November and entire December 2016¶

In [39]:
monthly_crimes.plot();
No description has been provided for this image
In [40]:
monthly_crimes.boxplot(color='Blue')
Out[40]:
<Axes: >
No description has been provided for this image

Splitting the data into train and test data¶

In [41]:
df=df.reset_index()
In [42]:
df
Out[42]:
index CrimeDate CrimeTime CrimeCode Location Description Inside/Outside Weapon Post District Neighborhood Location 1 Total Incidents Latitude Longitude Year Month
0 0 2016-11-12 02:35:00 02:35:00 3B 300 SAINT PAUL PL ROBBERY - STREET O NA 111.0 CENTRAL Downtown (39.2924100000, -76.6140800000) 1 39.29241 -76.61408 2016 11
1 1 2016-11-12 02:56:00 02:56:00 3CF 800 S BROADWAY ROBBERY - COMMERCIAL I FIREARM 213.0 SOUTHEASTERN Fells Point (39.2824200000, -76.5928800000) 1 39.28242 -76.59288 2016 11
2 2 2016-11-12 03:00:00 03:00:00 6D 1500 PENTWOOD RD LARCENY FROM AUTO O NA 413.0 NORTHEASTERN Stonewood-Pentwood-Winston (39.3480500000, -76.5883400000) 1 39.34805 -76.58834 2016 11
3 3 2016-11-12 03:00:00 03:00:00 6D 6600 MILTON LN LARCENY FROM AUTO O NA 424.0 NORTHEASTERN Westfield (39.3626300000, -76.5516100000) 1 39.36263 -76.55161 2016 11
4 4 2016-11-12 03:00:00 03:00:00 6E 300 W BALTIMORE ST LARCENY O NA 111.0 CENTRAL Downtown (39.2893800000, -76.6197100000) 1 39.28938 -76.61971 2016 11
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
284183 285802 2011-01-01 22:15:00 22:15:00 4D 6800 MCCLEAN BD AGG. ASSAULT I HANDS 423.0 NORTHEASTERN Hamilton Hills (39.3704700000, -76.5670500000) 1 39.37047 -76.56705 2011 1
284184 285803 2011-01-01 22:30:00 22:30:00 6J 3000 ODONNELL ST LARCENY I NA 232.0 SOUTHEASTERN Canton (39.2804600000, -76.5727300000) 1 39.28046 -76.57273 2011 1
284185 285804 2011-01-01 23:00:00 23:00:00 7A 2500 ARUNAH AV AUTO THEFT O NA 721.0 WESTERN Evergreen Lawn (39.2954200000, -76.6592800000) 1 39.29542 -76.65928 2011 1
284186 285805 2011-01-01 23:25:00 23:25:00 4E 100 N MONROE ST COMMON ASSAULT I HANDS 714.0 WESTERN Penrose/Fayette Street Outreach (39.2899900000, -76.6470700000) 1 39.28999 -76.64707 2011 1
284187 285806 2011-01-01 23:38:00 23:38:00 4D 800 N FREMONT AV AGG. ASSAULT I HANDS 123.0 WESTERN Upton (39.2981200000, -76.6339100000) 1 39.29812 -76.63391 2011 1

284188 rows × 17 columns

In [43]:
df.set_index('CrimeDate', inplace=True)
weekly_crime_data = df['Total Incidents'].resample('W').sum()
print(weekly_crime_data)
CrimeDate
2011-01-02     287
2011-01-09     807
2011-01-16     751
2011-01-23     813
2011-01-30     675
              ... 
2016-10-16    1031
2016-10-23    1105
2016-10-30    1063
2016-11-06    1022
2016-11-13     662
Freq: W-SUN, Name: Total Incidents, Length: 307, dtype: int64
In [44]:
weekly_crime_data=weekly_crime_data.to_frame()
weekly_crime_data=weekly_crime_data.reset_index()
weekly_crime_data.rename(columns={'CrimeDate': 'Week'}, inplace=True)
weekly_crime_data.set_index('Week',inplace=True)
In [45]:
weekly_crime_data
Out[45]:
Total Incidents
Week
2011-01-02 287
2011-01-09 807
2011-01-16 751
2011-01-23 813
2011-01-30 675
... ...
2016-10-16 1031
2016-10-23 1105
2016-10-30 1063
2016-11-06 1022
2016-11-13 662

307 rows × 1 columns

Function to Update November and December 2016 crimes by taking Average of historical Data¶

In [46]:
historical_mean = weekly_crime_data.loc['2011':'2015'].resample('W').mean()
historical_mean_by_week = historical_mean.groupby(historical_mean.index.week).mean()

# Range of missing weeks from '2016-11-06' to '2016-12-25'
missing_weeks = pd.date_range(start='2016-11-20', end='2016-12-25', freq='W')

# Filling missing values in 2016 based on the mean of corresponding weeks from previous years
for week in missing_weeks:
    week_number = week.week
    if week_number in historical_mean_by_week.index:
        mean_value = historical_mean_by_week.loc[week_number]['Total Incidents']
        weekly_crime_data.loc[week] = mean_value

weekly_crime_data = weekly_crime_data.sort_index()
C:\Users\tejas\AppData\Local\Temp\ipykernel_9520\2089400844.py:8: FutureWarning: weekofyear and week have been deprecated, please use DatetimeIndex.isocalendar().week instead, which returns a Series. To exactly reproduce the behavior of week and weekofyear and return an Index, you may call pd.Int64Index(idx.isocalendar().week)
  historical_mean_by_week = historical_mean.groupby(historical_mean.index.week).mean()
In [47]:
weekly_crime_data.tail(10)
Out[47]:
Total Incidents
Week
2016-10-23 1105.0
2016-10-30 1063.0
2016-11-06 1022.0
2016-11-13 662.0
2016-11-20 908.2
2016-11-27 916.2
2016-12-04 908.4
2016-12-11 882.4
2016-12-18 909.0
2016-12-25 938.0
In [48]:
train= weekly_crime_data[:'2015-08-30']
test= weekly_crime_data['2015-09-06':'2016-12-25']
In [49]:
print('First few rows of Training Data','\n',train.head(5),'\n')
print('Last few rows of Training Data','\n',train.tail(5),'\n')
print('First few rows of Test Data','\n',test.head(5),'\n')
print('Last few rows of Test Data','\n',test.tail(5),'\n')
First few rows of Training Data 
             Total Incidents
Week                       
2011-01-02            287.0
2011-01-09            807.0
2011-01-16            751.0
2011-01-23            813.0
2011-01-30            675.0 

Last few rows of Training Data 
             Total Incidents
Week                       
2015-08-02           1093.0
2015-08-09           1036.0
2015-08-16            973.0
2015-08-23           1066.0
2015-08-30           1053.0 

First few rows of Test Data 
             Total Incidents
Week                       
2015-09-06            999.0
2015-09-13           1051.0
2015-09-20           1048.0
2015-09-27           1011.0
2015-10-04            902.0 

Last few rows of Test Data 
             Total Incidents
Week                       
2016-11-27            916.2
2016-12-04            908.4
2016-12-11            882.4
2016-12-18            909.0
2016-12-25            938.0 

In [50]:
print(train.shape)
print(test.shape)
(244, 1)
(69, 1)
In [51]:
train['Total Incidents'].plot(figsize=(13,5), fontsize=14,color='Green')
test['Total Incidents'].plot(figsize=(13,5), fontsize=14,color='Red')
plt.grid()
plt.legend(['Training Data','Test Data'])
plt.show()
No description has been provided for this image
In [52]:
train_time = [i+1 for i in range(len(train))]
test_time = [i+134 for i in range(len(test))]
print('Training Time instance','\n',train_time)
print('Test Time instance','\n',test_time)
Training Time instance 
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244]
Test Time instance 
 [134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202]

Modelling¶

Linear Regression¶

In [53]:
LinearRegression_train = train.copy()
LinearRegression_test = test.copy()
In [54]:
LinearRegression_train['time'] = train_time
LinearRegression_test['time'] = test_time
In [55]:
lr = LinearRegression()
In [56]:
lr.fit(LinearRegression_train[['time']],LinearRegression_train['Total Incidents'].values)
Out[56]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [57]:
test_predictions_model1= lr.predict(LinearRegression_test[['time']])
LinearRegression_test['RegOnTime'] = test_predictions_model1

Evaluate this model on the test data using Root Mean Squared Error (RMSE)¶

In [59]:
rmse_model1_test = metrics.mean_squared_error(test['Total Incidents'],test_predictions_model1,squared=False)
print("For RegressionOnTime forecast on the Test Data,  RMSE is %3.3f" %(rmse_model1_test))
For RegressionOnTime forecast on the Test Data,  RMSE is 114.539
In [60]:
resultsDf = pd.DataFrame({'Test RMSE': [rmse_model1_test]},index=['RegressionOnTime'])
resultsDf
Out[60]:
Test RMSE
RegressionOnTime 114.539289

Exponential Smoothing Technique¶

Simple Exponential Smoothing¶

Building a Simple Exponential Smoothing model by using the parameter 'optimise=True' within the '.fit()' function. Then,evaluate the same model on the test set using RMSE.¶

In [62]:
SES_train = train.copy()
SES_test = test.copy()
In [63]:
model_SES = SimpleExpSmoothing(SES_train['Total Incidents'])
C:\Users\tejas\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency W-SUN will be used.
  self._init_dates(dates, freq)
In [64]:
model_SES_autofit = model_SES.fit(optimized=True,use_brute=True)
In [65]:
model_SES_autofit.params
Out[65]:
{'smoothing_level': 0.6567422871887287,
 'smoothing_trend': nan,
 'smoothing_seasonal': nan,
 'damping_trend': nan,
 'initial_level': 287.0,
 'initial_trend': nan,
 'initial_seasons': array([], dtype=float64),
 'use_boxcox': False,
 'lamda': None,
 'remove_bias': False}
In [66]:
test_predictions_model2 = model_SES_autofit.forecast(steps=len(test))
test_predictions_model2
Out[66]:
2015-09-06    1049.822454
2015-09-13    1049.822454
2015-09-20    1049.822454
2015-09-27    1049.822454
2015-10-04    1049.822454
                 ...     
2016-11-27    1049.822454
2016-12-04    1049.822454
2016-12-11    1049.822454
2016-12-18    1049.822454
2016-12-25    1049.822454
Freq: W-SUN, Length: 69, dtype: float64

Test Data - RMSE¶

In [67]:
rmse_model2_test = metrics.mean_squared_error(test['Total Incidents'],test_predictions_model2,squared=False)
print("For RegressionOnTime forecast on the Test Data,  RMSE is %3.3f" %(rmse_model2_test))
For RegressionOnTime forecast on the Test Data,  RMSE is 174.858
In [68]:
temp_resultsDf = pd.DataFrame({'Test RMSE': [rmse_model2_test]}
                              ,index=['Alpha=0.0976:SimpleExponentialSmoothing'])

resultsDf = pd.concat([resultsDf, temp_resultsDf])
resultsDf
Out[68]:
Test RMSE
RegressionOnTime 114.539289
Alpha=0.0976:SimpleExponentialSmoothing 174.858279

Compared to SES, Linear Regression is showing better RMSE for test data

Double Exponential Smoothing¶

Build a Double Exponential Smoothing model by using the parameter 'optimise=True' within the '.fit()' function. Then,evaluate the same model on the test set using RMSE.¶

In [69]:
DES_train = train.copy()
DES_test = test.copy()
In [70]:
model_DES = Holt(SES_train['Total Incidents'])
C:\Users\tejas\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency W-SUN will be used.
  self._init_dates(dates, freq)
In [71]:
model_DES_autofit = model_DES.fit(optimized=True,use_brute=True)
In [72]:
model_DES_autofit.params_formatted
Out[72]:
name param optimized
smoothing_level alpha 0.734319 True
smoothing_trend beta 0.443518 True
initial_level l.0 287.000000 False
initial_trend b.0 520.000000 False
In [73]:
test_predictions_model3 = model_DES_autofit.forecast(steps=len(test))
test_predictions_model3
Out[73]:
2015-09-06    1054.392034
2015-09-13    1060.062062
2015-09-20    1065.732091
2015-09-27    1071.402119
2015-10-04    1077.072148
                 ...     
2016-11-27    1417.273854
2016-12-04    1422.943883
2016-12-11    1428.613911
2016-12-18    1434.283940
2016-12-25    1439.953968
Freq: W-SUN, Length: 69, dtype: float64

Test Data - RMSE¶

In [74]:
rmse_model3_test = metrics.mean_squared_error(test['Total Incidents'],test_predictions_model3,squared=False)
print("For RegressionOnTime forecast on the Test Data,  RMSE is %3.3f" %(rmse_model3_test))
For RegressionOnTime forecast on the Test Data,  RMSE is 360.706
In [75]:
temp_resultsDf = pd.DataFrame({'Test RMSE': [rmse_model3_test]}
                              ,index=['Alpha=1.490116e-08,Beta=3.694809e-16:DoubleExponentialSmoothing'])

resultsDf = pd.concat([resultsDf, temp_resultsDf])
resultsDf
Out[75]:
Test RMSE
RegressionOnTime 114.539289
Alpha=0.0976:SimpleExponentialSmoothing 174.858279
Alpha=1.490116e-08,Beta=3.694809e-16:DoubleExponentialSmoothing 360.705570

Triple Exponential Smoothing¶

In [76]:
TES_train = train.copy()
TES_test = test.copy()
In [77]:
model_TES = ExponentialSmoothing(TES_train['Total Incidents'],trend='multiplicative',seasonal='multiplicative',)
C:\Users\tejas\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency W-SUN will be used.
  self._init_dates(dates, freq)
In [78]:
model_TES_autofit = model_TES.fit(optimized=True,use_brute=True)
In [79]:
model_TES_autofit.params
Out[79]:
{'smoothing_level': 0.2525,
 'smoothing_trend': 0.0001,
 'smoothing_seasonal': 0.0001,
 'damping_trend': nan,
 'initial_level': 961.6243589743584,
 'initial_trend': 1.001732511895604,
 'initial_seasons': array([0.90967812, 0.88364294, 0.89707828, 0.93219462, 0.8869683 ,
        0.89219784, 0.82004037, 0.80195965, 0.81988594, 0.86147559,
        0.83711081, 0.97091752, 0.92985983, 0.92082561, 0.91811072,
        0.9813616 , 1.00112623, 1.01694243, 1.00046211, 1.04751437,
        1.10692915, 1.11244399, 1.05176743, 1.04814546, 1.05590356,
        1.08976004, 1.11151645, 1.01906403, 1.04752315, 1.07137783,
        1.11351715, 1.12296536, 1.06241094, 1.10280727, 1.05027149,
        1.04146096, 1.01332453, 1.01478471, 1.04327886, 1.09601043,
        1.06749581, 1.07081863, 1.08917799, 1.07709044, 1.04984822,
        1.06636488, 0.98257513, 0.94802268, 0.99575982, 0.9650088 ,
        0.96373058, 1.01949137]),
 'use_boxcox': False,
 'lamda': None,
 'remove_bias': False}
In [80]:
test_predictions_model4 = model_TES_autofit.forecast(steps=len(test))
test_predictions_model4
Out[80]:
2015-09-06    1003.473011
2015-09-13    1006.609271
2015-09-20    1036.636727
2015-09-27    1090.878215
2015-10-04    1064.304603
                 ...     
2016-11-27    1099.180767
2016-12-04    1067.049931
2016-12-11    1067.445501
2016-12-18    1131.120345
2016-12-25    1010.931192
Freq: W-SUN, Length: 69, dtype: float64

Test Data - RMSE¶

In [81]:
rmse_model4_test = metrics.mean_squared_error(test['Total Incidents'],test_predictions_model4,squared=False)
print("For RegressionOnTime forecast on the Test Data,  RMSE is %3.3f" %(rmse_model4_test))
For RegressionOnTime forecast on the Test Data,  RMSE is 162.265
In [82]:
temp_resultsDf = pd.DataFrame({'Test RMSE': [rmse_model4_test]}
                              ,index=['Alpha=0.07575,Beta=0.0541,Gamma=0.4107:TripleExponentialSmoothing'])

resultsDf = pd.concat([resultsDf, temp_resultsDf])
resultsDf
Out[82]:
Test RMSE
RegressionOnTime 114.539289
Alpha=0.0976:SimpleExponentialSmoothing 174.858279
Alpha=1.490116e-08,Beta=3.694809e-16:DoubleExponentialSmoothing 360.705570
Alpha=0.07575,Beta=0.0541,Gamma=0.4107:TripleExponentialSmoothing 162.265218

Naive Model¶

In [83]:
NaiveModel_train = train.copy()
NaiveModel_test = test.copy()
In [84]:
NaiveModel_test['naive'] = np.asarray(train['Total Incidents'])[len(np.asarray(train['Total Incidents']))-1]
NaiveModel_test['naive'].head()
Out[84]:
Week
2015-09-06    1053.0
2015-09-13    1053.0
2015-09-20    1053.0
2015-09-27    1053.0
2015-10-04    1053.0
Name: naive, dtype: float64
In [85]:
plt.subplots(figsize=(14,6))

plt.plot(NaiveModel_train['Total Incidents'], label='Train',color='Green')
plt.plot(test['Total Incidents'], label='Test',color='Red')

plt.plot(NaiveModel_test['naive'], label='Naive Forecast on Test Data')

plt.legend(loc='best')
plt.title("Naive Forecast")
plt.grid();
No description has been provided for this image
In [86]:
rmse_model2_test = metrics.mean_squared_error(test['Total Incidents'],NaiveModel_test['naive'],squared=False)
print("For Naive Bayes forecast on the Test Data,  RMSE is %3.3f" %(rmse_model2_test))
For Naive Bayes forecast on the Test Data,  RMSE is 177.281
In [87]:
resultsDf_2 = pd.DataFrame({'Test RMSE': [rmse_model2_test]},index=['NaiveModel'])

resultsDf = pd.concat([resultsDf, resultsDf_2])
resultsDf
Out[87]:
Test RMSE
RegressionOnTime 114.539289
Alpha=0.0976:SimpleExponentialSmoothing 174.858279
Alpha=1.490116e-08,Beta=3.694809e-16:DoubleExponentialSmoothing 360.705570
Alpha=0.07575,Beta=0.0541,Gamma=0.4107:TripleExponentialSmoothing 162.265218
NaiveModel 177.280822

Simple Average¶

In [88]:
SimpleAverage_train = train.copy()
SimpleAverage_test = test.copy()
In [89]:
SimpleAverage_test['mean_forecast'] = train['Total Incidents'].mean()
SimpleAverage_test.head()
Out[89]:
Total Incidents mean_forecast
Week
2015-09-06 999.0 927.72541
2015-09-13 1051.0 927.72541
2015-09-20 1048.0 927.72541
2015-09-27 1011.0 927.72541
2015-10-04 902.0 927.72541
In [90]:
plt.subplots(figsize=(14,6))

plt.plot(SimpleAverage_train['Total Incidents'], label='Train',color='Green')
plt.plot(SimpleAverage_test['Total Incidents'], label='Test',color='Red')

plt.plot(SimpleAverage_test['Total Incidents'], label='Simple Average on Test Data')

plt.legend(loc='best')
plt.title("Simple Average Forecast")
plt.grid();
No description has been provided for this image
In [91]:
rmse_model3_test = metrics.mean_squared_error(test['Total Incidents'],SimpleAverage_test['mean_forecast'],squared=False)
print("For Simple Average forecast on the Test Data,  RMSE is %3.3f" %(rmse_model3_test))
For Simple Average forecast on the Test Data,  RMSE is 114.420
In [92]:
resultsDf_3 = pd.DataFrame({'Test RMSE': [rmse_model3_test]}
                           ,index=['SimpleAverageModel'])

resultsDf = pd.concat([resultsDf, resultsDf_3])
resultsDf
Out[92]:
Test RMSE
RegressionOnTime 114.539289
Alpha=0.0976:SimpleExponentialSmoothing 174.858279
Alpha=1.490116e-08,Beta=3.694809e-16:DoubleExponentialSmoothing 360.705570
Alpha=0.07575,Beta=0.0541,Gamma=0.4107:TripleExponentialSmoothing 162.265218
NaiveModel 177.280822
SimpleAverageModel 114.419510

Moving Average(MA)¶

In [93]:
MovingAverage = weekly_crime_data.copy()
MovingAverage.head()
Out[93]:
Total Incidents
Week
2011-01-02 287.0
2011-01-09 807.0
2011-01-16 751.0
2011-01-23 813.0
2011-01-30 675.0
In [94]:
MovingAverage['Trailing_3'] = MovingAverage['Total Incidents'].rolling(3).mean()
MovingAverage['Trailing_6'] = MovingAverage['Total Incidents'].rolling(6).mean()
MovingAverage['Trailing_9'] = MovingAverage['Total Incidents'].rolling(9).mean()
MovingAverage['Trailing_12'] = MovingAverage['Total Incidents'].rolling(12).mean()

MovingAverage.head()
Out[94]:
Total Incidents Trailing_3 Trailing_6 Trailing_9 Trailing_12
Week
2011-01-02 287.0 NaN NaN NaN NaN
2011-01-09 807.0 NaN NaN NaN NaN
2011-01-16 751.0 615.000000 NaN NaN NaN
2011-01-23 813.0 790.333333 NaN NaN NaN
2011-01-30 675.0 746.333333 NaN NaN NaN
In [95]:
MovingAverage.shape
Out[95]:
(313, 5)
In [96]:
#plotting the data
plt.plot(MovingAverage['Total Incidents'], label='Train')
plt.plot(MovingAverage['Trailing_3'], label='3 Point Moving Average')
plt.plot(MovingAverage['Trailing_6'], label='6 Point Moving Average')
plt.plot(MovingAverage['Trailing_9'],label = '9 Point Moving Average')
plt.plot(MovingAverage['Trailing_12'],label = '12 Point Moving Average')

plt.legend(loc = 'best')
plt.grid();
No description has been provided for this image

Creating train and test set¶

In [97]:
trailing_MovingAverage_train=MovingAverage[0:len(train)] 
trailing_MovingAverage_test=MovingAverage[len(train):]
In [98]:
trailing_MovingAverage_train.head()
Out[98]:
Total Incidents Trailing_3 Trailing_6 Trailing_9 Trailing_12
Week
2011-01-02 287.0 NaN NaN NaN NaN
2011-01-09 807.0 NaN NaN NaN NaN
2011-01-16 751.0 615.000000 NaN NaN NaN
2011-01-23 813.0 790.333333 NaN NaN NaN
2011-01-30 675.0 746.333333 NaN NaN NaN

Plotting on both the Training and Test data¶

In [99]:
fig, axes = plt.subplots(2, 2, sharey=False, sharex=False)
fig.set_figwidth(14)
fig.set_figheight(8)

axes[0][0].plot(trailing_MovingAverage_train['Total Incidents'], label='Train',color='Green')
axes[0][0].plot(trailing_MovingAverage_test['Total Incidents'], label='Test',color='Red')

axes[0][0].plot(trailing_MovingAverage_test['Trailing_3'], label='3 Point Trailing Moving Average on Test Set')
axes[0][0].set_title("3-Months Moving Average")
axes[0][0].legend(loc='best')

axes[0][1].plot(trailing_MovingAverage_train['Total Incidents'], label='Train',color='Green')
axes[0][1].plot(trailing_MovingAverage_test['Total Incidents'], label='Test',color='Red')

axes[0][1].plot(trailing_MovingAverage_test['Trailing_6'], label='6 Point Trailing Moving Average on Test Set')
axes[0][1].set_title("6-Months Moving Average")
axes[0][1].legend(loc='best')

axes[1][0].plot(trailing_MovingAverage_train['Total Incidents'], label='Train',color='Green')
axes[1][0].plot(trailing_MovingAverage_test['Total Incidents'], label='Test',color='Red')

axes[1][0].plot(trailing_MovingAverage_test['Trailing_9'],label = '9 Point Trailing Moving Average on Test Set')
axes[1][0].set_title("9-Months Moving Average")
axes[1][0].legend(loc='best')

axes[1][1].plot(trailing_MovingAverage_train['Total Incidents'], label='Train',color='Green')
axes[1][1].plot(trailing_MovingAverage_test['Total Incidents'], label='Test',color='Red')

axes[1][1].plot(trailing_MovingAverage_test['Trailing_12'],label = '12 Point Trailing Moving Average on Test Set')
axes[1][1].set_title("12-Months Moving Average")
axes[1][1].legend(loc='best')

plt.tight_layout()
No description has been provided for this image
In [100]:
rmse_model4_test_3 = metrics.mean_squared_error(test['Total Incidents'],trailing_MovingAverage_test['Trailing_3'],squared=False)
print("For 3 point Moving Average Model forecast on the Training Data,  RMSE is %3.3f" %(rmse_model4_test_3))


rmse_model4_test_6 = metrics.mean_squared_error(test['Total Incidents'],trailing_MovingAverage_test['Trailing_6'],squared=False)
print("For 6 point Moving Average Model forecast on the Training Data,  RMSE is %3.3f" %(rmse_model4_test_6))


rmse_model4_test_9 = metrics.mean_squared_error(test['Total Incidents'],trailing_MovingAverage_test['Trailing_9'],squared=False)
print("For 9 point Moving Average Model forecast on the Training Data,  RMSE is %3.3f" %(rmse_model4_test_9))


rmse_model4_test_12 = metrics.mean_squared_error(test['Total Incidents'],trailing_MovingAverage_test['Trailing_12'],squared=False)
print("For 12 point Moving Average Model forecast on the Training Data,  RMSE is %3.3f" %(rmse_model4_test_12))
For 3 point Moving Average Model forecast on the Training Data,  RMSE is 57.399
For 6 point Moving Average Model forecast on the Training Data,  RMSE is 75.151
For 9 point Moving Average Model forecast on the Training Data,  RMSE is 88.332
For 12 point Moving Average Model forecast on the Training Data,  RMSE is 98.691
In [101]:
resultsDf_4 = pd.DataFrame({'Test RMSE': [rmse_model4_test_3,rmse_model4_test_6
                                          ,rmse_model4_test_9,rmse_model4_test_12]}
                           ,index=['3pointTrailingMovingAverage','6pointTrailingMovingAverage'
                                   ,'9pointTrailingMovingAverage','12pointTrailingMovingAverage'])

resultsDf = pd.concat([resultsDf, resultsDf_4])
resultsDf
Out[101]:
Test RMSE
RegressionOnTime 114.539289
Alpha=0.0976:SimpleExponentialSmoothing 174.858279
Alpha=1.490116e-08,Beta=3.694809e-16:DoubleExponentialSmoothing 360.705570
Alpha=0.07575,Beta=0.0541,Gamma=0.4107:TripleExponentialSmoothing 162.265218
NaiveModel 177.280822
SimpleAverageModel 114.419510
3pointTrailingMovingAverage 57.398674
6pointTrailingMovingAverage 75.150899
9pointTrailingMovingAverage 88.331849
12pointTrailingMovingAverage 98.690973
In [102]:
plt.subplots(figsize = (14,6))

plt.plot(train['Total Incidents'], label='Train',color='Green')
plt.plot(test['Total Incidents'], label='Test',color='Red')

plt.plot(LinearRegression_test['RegOnTime'], label='Linear Regression on test data',color='Purple')

plt.plot(NaiveModel_test['naive'], label='Naive Forecast on Test Data',color='Blue')

plt.plot(SimpleAverage_test['mean_forecast'], label='Simple Average on Test Data')

plt.plot(trailing_MovingAverage_test['Trailing_3'], label='3 Point Trailing Moving Average on Training Set',color='Orange')


plt.legend(loc='best')
plt.title("Model Comparison Plots")
plt.grid();
No description has been provided for this image

ARIMA Models¶

In [104]:
dftest = adfuller(weekly_crime_data['Total Incidents'])
dftest
print('DF test statistic is %3.3f' %dftest[0])
print('DF test p-value is %1.4f' %dftest[1])
DF test statistic is -4.367
DF test p-value is 0.0003
In [105]:
plt.figure(figsize=(12, 8))
plt.plot(weekly_crime_data['Total Incidents'].diff(periods=1),color='Green')
plt.xlabel('Years')
plt.ylabel('Incidents');
No description has been provided for this image

There is seasonality present in the series

In [106]:
plt.figure(figsize=(12, 8))
plt.plot(np.log10(weekly_crime_data['Total Incidents']),color='Red')
plt.xlabel('Years')
plt.ylabel('Incidents');
No description has been provided for this image
In [107]:
crimes = weekly_crime_data['Total Incidents']
plt.figure(figsize=(10, 5))
plt.plot(np.log10(crimes).diff(periods=1),color='Blue')
plt.xlabel('Years')
plt.ylabel('Differenced Log (Incidents)');
No description has been provided for this image
In [108]:
crimes_log = np.log10(crimes)
crimes_log.dropna(inplace=True)

crimes_log_diff = crimes_log.diff(periods=1) 
crimes_log_diff.dropna(inplace=True)
In [109]:
fig, axes = plt.subplots(1, 2)
fig.set_figwidth(12)
fig.set_figheight(4)
smt.graphics.plot_acf(crimes_log, lags=30, ax=axes[0])
smt.graphics.plot_pacf(crimes_log, lags=30, ax=axes[1])
plt.tight_layout()
No description has been provided for this image
In [110]:
fig, axes = plt.subplots(1, 2)
fig.set_figwidth(12)
fig.set_figheight(4)
plt.xticks(range(0,30,1), rotation = 90)
smt.graphics.plot_acf(crimes_log_diff, lags=30, ax=axes[0])
smt.graphics.plot_pacf(crimes_log_diff, lags=30, ax=axes[1])
plt.tight_layout()
No description has been provided for this image
In [111]:
p = d = q = range(0, 2)
pdq = list(itertools.product(p, d, q))
seasonal_pdq = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))]

Separating data into train and test¶

In [112]:
weekly_crime_data['Week'] = weekly_crime_data.index
train = weekly_crime_data[weekly_crime_data.index < '2015-08-30']
test = weekly_crime_data[weekly_crime_data.index >= '2015-09-06']
train_crimes_log = np.log10(train['Total Incidents'])

Predict Crimes using ARIMA Model¶

In [115]:
best_model = sm.tsa.statespace.SARIMAX(crimes_log,
                                      order=(1, 1, 0),
                                      seasonal_order=(0, 1, 0, 12),
                                      enforce_stationarity=True)
best_results = best_model.fit()
C:\Users\tejas\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency W-SUN will be used.
  self._init_dates(dates, freq)
C:\Users\tejas\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency W-SUN will be used.
  self._init_dates(dates, freq)
C:\Users\tejas\anaconda3\Lib\site-packages\statsmodels\base\model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  warnings.warn("Maximum Likelihood optimization failed to "
In [116]:
print(best_results.summary().tables[0])
print(best_results.summary().tables[1])
                                     SARIMAX Results                                      
==========================================================================================
Dep. Variable:                    Total Incidents   No. Observations:                  313
Model:             SARIMAX(1, 1, 0)x(0, 1, 0, 12)   Log Likelihood                 433.085
Date:                            Sun, 26 Nov 2023   AIC                           -862.171
Time:                                    20:24:30   BIC                           -854.763
Sample:                                01-02-2011   HQIC                          -859.206
                                     - 12-25-2016                                         
Covariance Type:                              opg                                         
==========================================================================================
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.3086      0.041     -7.459      0.000      -0.390      -0.227
sigma2         0.0033      0.000     27.559      0.000       0.003       0.003
==============================================================================
In [117]:
pred_dynamic = best_results.get_prediction(start='2015-09-06', dynamic=True, full_results=True)
C:\Users\tejas\anaconda3\Lib\site-packages\statsmodels\tsa\statespace\representation.py:374: FutureWarning: Unknown keyword arguments: dict_keys(['full_results']).Passing unknown keyword arguments will raise a TypeError beginning in version 0.15.
  warnings.warn(msg, FutureWarning)
In [118]:
pred_dynamic_ci = pred_dynamic.conf_int()
In [119]:
pred99 = best_results.get_forecast(steps=52,alpha=0.05)
C:\Users\tejas\anaconda3\Lib\site-packages\statsmodels\tsa\statespace\representation.py:374: FutureWarning: Unknown keyword arguments: dict_keys(['alpha']).Passing unknown keyword arguments will raise a TypeError beginning in version 0.15.
  warnings.warn(msg, FutureWarning)
In [120]:
print(pred99.predicted_mean)
2017-01-01    2.962235
2017-01-08    2.979506
2017-01-15    3.007703
2017-01-22    2.991462
2017-01-29    2.974198
2017-02-05    2.785661
2017-02-12    2.922967
2017-02-19    2.926782
2017-02-26    2.923067
2017-03-05    2.910456
2017-03-12    2.923354
2017-03-19    2.936993
2017-03-26    2.927025
2017-04-02    2.944296
2017-04-09    2.972493
2017-04-16    2.956252
2017-04-23    2.938988
2017-04-30    2.750451
2017-05-07    2.887758
2017-05-14    2.891572
2017-05-21    2.887857
2017-05-28    2.875246
2017-06-04    2.888144
2017-06-11    2.901783
2017-06-18    2.891815
2017-06-25    2.909086
2017-07-02    2.937283
2017-07-09    2.921042
2017-07-16    2.903778
2017-07-23    2.715241
2017-07-30    2.852548
2017-08-06    2.856362
2017-08-13    2.852647
2017-08-20    2.840036
2017-08-27    2.852934
2017-09-03    2.866573
2017-09-10    2.856605
2017-09-17    2.873876
2017-09-24    2.902073
2017-10-01    2.885832
2017-10-08    2.868568
2017-10-15    2.680032
2017-10-22    2.817338
2017-10-29    2.821152
2017-11-05    2.817437
2017-11-12    2.804826
2017-11-19    2.817724
2017-11-26    2.831363
2017-12-03    2.821395
2017-12-10    2.838666
2017-12-17    2.866863
2017-12-24    2.850622
Freq: W-SUN, Name: predicted_mean, dtype: float64

Extracting the predicted and true values of our time series¶

In [121]:
crimes_forecasted = pred_dynamic.predicted_mean
testCopy = test.copy()
#changing the scale of the logarithmic scale to the original scale by raising the predicted values to the power of 10
testCopy['Total Crimes'] = np.power(10,crimes_forecasted)
In [122]:
testCopy.head()
Out[122]:
Total Incidents Week Total Crimes
Week
2015-09-06 999.0 2015-09-06 1078.341724
2015-09-13 1051.0 2015-09-13 1024.808035
2015-09-20 1048.0 2015-09-20 1062.424502
2015-09-27 1011.0 2015-09-27 1036.470653
2015-10-04 902.0 2015-10-04 1064.726950
In [123]:
mse = ((testCopy['Total Incidents'] - testCopy['Total Crimes']) ** 2).mean()
rmse = np.sqrt(mse)
print('The Root Mean Squared Error of our forecasts is {}'.format(round(rmse, 3)))
The Root Mean Squared Error of our forecasts is 147.416
In [124]:
resultsDf_5 = pd.DataFrame({'Test RMSE': [round(rmse, 3)]}
                           ,index=['ARIMA'])

resultsDf = pd.concat([resultsDf, resultsDf_5])
resultsDf
Out[124]:
Test RMSE
RegressionOnTime 114.539289
Alpha=0.0976:SimpleExponentialSmoothing 174.858279
Alpha=1.490116e-08,Beta=3.694809e-16:DoubleExponentialSmoothing 360.705570
Alpha=0.07575,Beta=0.0541,Gamma=0.4107:TripleExponentialSmoothing 162.265218
NaiveModel 177.280822
SimpleAverageModel 114.419510
3pointTrailingMovingAverage 57.398674
6pointTrailingMovingAverage 75.150899
9pointTrailingMovingAverage 88.331849
12pointTrailingMovingAverage 98.690973
ARIMA 147.416000
In [125]:
axis = train['Total Incidents'].plot(label='Total Incidents', figsize=(10, 6))
testCopy['Total Incidents'].plot(ax=axis, label='Total Incidents', alpha=0.7)
testCopy['Total Crimes'].plot(ax=axis, label='forecasted', alpha=0.7)
axis.set_xlabel('Years')
axis.set_ylabel('Crimes')
plt.legend(loc='best')
plt.show()
plt.close()
No description has been provided for this image

Get forecast 52 steps (1 years) ahead in future¶

In [126]:
n_steps = 52
pred_uc_99 = best_results.get_forecast(steps=n_steps, alpha=0.01) 
pred_uc_95 = best_results.get_forecast(steps=n_steps, alpha=0.05) 
pred_ci_99 = pred_uc_99.conf_int()
pred_ci_95 = pred_uc_95.conf_int()
C:\Users\tejas\anaconda3\Lib\site-packages\statsmodels\tsa\statespace\representation.py:374: FutureWarning: Unknown keyword arguments: dict_keys(['alpha']).Passing unknown keyword arguments will raise a TypeError beginning in version 0.15.
  warnings.warn(msg, FutureWarning)
In [127]:
idx = pd.date_range(weekly_crime_data.index[-1], periods=n_steps, freq='W')

fc_95 = pd.DataFrame(np.column_stack([np.power(10, pred_uc_95.predicted_mean), np.power(10, pred_ci_95)]), 
                     index=idx, columns=['forecast', 'lower_ci_95', 'upper_ci_95'])

fc_99 = pd.DataFrame(np.column_stack([np.power(10, pred_ci_99)]),
                     index=idx, columns=['lower_ci_99', 'upper_ci_99'])



fc_all = fc_95.combine_first(fc_99)

fc_all = fc_all[['forecast', 'lower_ci_95', 'upper_ci_95']] 


fc_all.head(12)
Out[127]:
forecast lower_ci_95 upper_ci_95
2016-12-25 916.715762 708.423753 1186.250157
2017-01-01 953.906075 697.286190 1304.968910
2017-01-08 1017.894612 700.815944 1478.433032
2017-01-15 980.532638 643.481362 1494.129140
2017-01-22 942.319485 591.774475 1500.514213
2017-01-29 610.465709 368.369925 1011.668860
2017-02-05 837.466608 486.959745 1440.263445
2017-02-12 844.853918 474.517433 1504.219006
2017-02-19 837.658143 455.338288 1540.988719
2017-02-26 813.683850 428.790233 1544.068305
2017-03-05 838.212096 428.832946 1638.399109
2017-03-12 864.953832 430.146577 1739.279519
In [128]:
start_date=weekly_crime_data['Week'].iloc[0]
end_date=fc_all.index[-1]
axis = crimes.plot(label='Observed', figsize=(8, 4))
axis.set_xlim([start_date, end_date])  # Define your start_date and end_date
axis.set_ylim(0, 2000) 
fc_all['forecast'].plot(ax=axis, label='Forecast', alpha=0.7)
axis.fill_between(fc_all.index, fc_all['lower_ci_95'], fc_all['upper_ci_95'], color='k', alpha=.15)
axis.set_xlabel('Years')
axis.set_ylabel('Crimes')
plt.legend(loc='best')
plt.show()
No description has been provided for this image
In [129]:
best_results.plot_diagnostics(lags=30, figsize=(10,8))
plt.show()
No description has been provided for this image